Import data

HateCrime_df=
   read_csv("./data/HateCrimes.csv") 
## Parsed with column specification:
## cols(
##   state = col_character(),
##   unemployment = col_character(),
##   urbanization = col_character(),
##   median_household_income = col_double(),
##   perc_population_with_high_school_degree = col_double(),
##   perc_non_citizen = col_double(),
##   gini_index = col_double(),
##   perc_non_white = col_double(),
##   hate_crimes_per_100k_splc = col_character()
## )
#Noting we do not have hatecrime stats for Hawaii,North Dakota,South Dakota,and Wyoming

Data exploration: descriptive statistics and visualization.

Explore the distribution of the outcome and consider potential transformations (if the case).

For analysis purpose, I delelte rows that do not have hate-crime record, and convert hate crime data to numerical value

HateCrime_df =
  HateCrime_df %>% 
  filter(hate_crimes_per_100k_splc != "N/A") %>%
    na.omit() %>%
  mutate(hate_crimes_per_100k_splc = as.numeric(hate_crimes_per_100k_splc))
Explore the distribution of the outcome
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#HateCrime_df %>% 
#  filter(hate_crimes_per_100k_splc != "N/A") %>%
 # mutate(hate_crimes_per_100k_splc = as.numeric(hate_crimes_per_100k_splc)) %>% 
 # ggplot(aes(x=hate_crimes_per_100k_splc))+
 # geom_boxplot()+
 # coord_flip()


HateCrime_df %>% 
  mutate(text_label = str_c(" hate crime rate per 100,000 population: ", hate_crimes_per_100k_splc, "\nState: ", state)) %>% 
  plot_ly(y = ~hate_crimes_per_100k_splc, type = "box", colors = "viridis",text = ~text_label)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
HateCrime_df %>% 
  mutate(state = fct_reorder(state, hate_crimes_per_100k_splc)) %>% 
  ggplot(aes(x=hate_crimes_per_100k_splc)) +
  geom_histogram(fill="darkorange2") +
  labs(x = "hate crime rate per 100,000 population", y = "frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# plot_ly(y = ~price, color = ~neighbourhood, type = "box", colors = "viridis")
HateCrime_df %>% 
  mutate(state = fct_reorder(state, hate_crimes_per_100k_splc)) %>% 
  ggplot(aes(x=state,y=hate_crimes_per_100k_splc,fill=state)) +
  geom_bar( stat = "identity")+
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  labs(x = "state", y = "hate crime rate per 100,000 population")+
  theme(legend.position = "none")

summary(HateCrime_df$hate_crimes_per_100k_splc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.06906 0.14374 0.22620 0.30243 0.35062 1.52230
  • From the boxplot and histogram we see that the distribution of hate crime rate data is skewed to the right, with a couple outliers containing much higher hate crime rate than the rest of data.
  • According to the summary and boxplot, we see most our hate crime rate data are between 0.14271(q1) and 0.35693(q3) interval with mean of 0.30409 and median of 0.22620. The reason why we have higher mean than median is also because mean is influence by the outliers that have extraordinary high hate crime.
  • Specifically, from the boxplot and bar graph we see two outliers are identified:
    • record from District of Columbia with hate crime rate of 1.522
    • record from Oregon with hate crime rate of 0.8328
consider potential transformations (if the case).
Log Transformation
log=
HateCrime_df %>% 
  mutate(state = fct_reorder(state, hate_crimes_per_100k_splc),
       hate_crimes_per_100k_splc = log(hate_crimes_per_100k_splc)  ) %>% 
  ggplot(aes(x=hate_crimes_per_100k_splc)) +
  geom_histogram(fill="firebrick4") +
  labs(title= "log transformation",x = "log(hate crime rate per 100,000 population)", y = "frequency")
square root Transformation
squre_root=
HateCrime_df %>% 
  mutate(state = fct_reorder(state, hate_crimes_per_100k_splc),
       hate_crimes_per_100k_splc = (hate_crimes_per_100k_splc)^(0.5)  ) %>% 
  ggplot(aes(x=hate_crimes_per_100k_splc)) +
  geom_histogram(fill="firebrick4") +
  labs(title= "square root transformation",x = "sqrt(hate crime rate per 100,000 population)", y = "frequency")
cube root Transformation
cube_root=
HateCrime_df %>% 
  mutate(state = fct_reorder(state, hate_crimes_per_100k_splc),
       hate_crimes_per_100k_splc = (hate_crimes_per_100k_splc)^(1/3)  ) %>% 
  ggplot(aes(x=hate_crimes_per_100k_splc)) +
  geom_histogram(fill="firebrick4") +
  labs(title= "cube root transformation",x = "cube_root(hate crime rate per 100,000 population)", y = "frequency")
(log+squre_root)/cube_root
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  • comment:Just by looking at the histogram distribution, seems like log transformation is the best transformation to convert the current skewed distribution into a relatively normal one. We might need to perform boxcox and checking the diagnostic plot to be sure.

Model Building.

visualizing Correlation

HateCrime_df%>%
  select(-state,-unemployment,-urbanization) %>%
pairs()

HateCrime_df%>%
  select(-state,-unemployment,-urbanization) %>%
cor()
##                                         median_household_income
## median_household_income                              1.00000000
## perc_population_with_high_school_degree              0.65113832
## perc_non_citizen                                     0.30173941
## gini_index                                          -0.12952158
## perc_non_white                                       0.03905399
## hate_crimes_per_100k_splc                            0.34378921
##                                         perc_population_with_high_school_degree
## median_household_income                                               0.6511383
## perc_population_with_high_school_degree                               1.0000000
## perc_non_citizen                                                     -0.2621288
## gini_index                                                           -0.5371591
## perc_non_white                                                       -0.4958932
## hate_crimes_per_100k_splc                                             0.2628198
##                                         perc_non_citizen gini_index
## median_household_income                        0.3017394 -0.1295216
## perc_population_with_high_school_degree       -0.2621288 -0.5371591
## perc_non_citizen                               1.0000000  0.4798976
## gini_index                                     0.4798976  1.0000000
## perc_non_white                                 0.7526102  0.5484035
## hate_crimes_per_100k_splc                      0.2435066  0.3805028
##                                         perc_non_white
## median_household_income                     0.03905399
## perc_population_with_high_school_degree    -0.49589321
## perc_non_citizen                            0.75261020
## gini_index                                  0.54840351
## perc_non_white                              1.00000000
## hate_crimes_per_100k_splc                   0.11116503
##                                         hate_crimes_per_100k_splc
## median_household_income                                 0.3437892
## perc_population_with_high_school_degree                 0.2628198
## perc_non_citizen                                        0.2435066
## gini_index                                              0.3805028
## perc_non_white                                          0.1111650
## hate_crimes_per_100k_splc                               1.0000000
  • comment:
    • Looking at the correlation matrix, specifically on the correlation between hate crime rate and the rest of covariates we see that gini_index, which measures income inequality, does have the strongest correlation with hate crime rate, similarly, median household income have relatively high correlation with hate crime rate as well, indicating the difference in income does have strong influence on hate crime rate.
    • Note about potential collinearity: From the correlation matrix we see that percent noncitizen population and percent nonwhite population have have correlation of -.7526. Similarly, another high correlation of 0.65 is observed between median household income and percent of adults 25 and older with at least a high school degree
    • All these observation are made without removing any potential outlier and influential point

Fit model

Fit model with every term
mult_fit <- lm(hate_crimes_per_100k_splc ~ median_household_income + perc_non_white + perc_non_citizen + perc_population_with_high_school_degree + gini_index + unemployment + urbanization, data=HateCrime_df)
summary(mult_fit)
## 
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ median_household_income + 
##     perc_non_white + perc_non_citizen + perc_population_with_high_school_degree + 
##     gini_index + unemployment + urbanization, data = HateCrime_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36552 -0.10314 -0.01316  0.09731  0.51389 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             -8.296e+00  1.908e+00  -4.349 0.000103
## median_household_income                 -1.504e-06  5.961e-06  -0.252 0.802193
## perc_non_white                          -5.842e-03  3.673e-01  -0.016 0.987396
## perc_non_citizen                         1.233e+00  1.877e+00   0.657 0.515332
## perc_population_with_high_school_degree  5.382e+00  1.835e+00   2.933 0.005735
## gini_index                               8.624e+00  1.973e+00   4.370 9.67e-05
## unemploymentlow                          1.307e-02  7.173e-02   0.182 0.856425
## urbanizationlow                          3.309e-02  8.475e-02   0.390 0.698475
##                                            
## (Intercept)                             ***
## median_household_income                    
## perc_non_white                             
## perc_non_citizen                           
## perc_population_with_high_school_degree ** 
## gini_index                              ***
## unemploymentlow                            
## urbanizationlow                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2014 on 37 degrees of freedom
## Multiple R-squared:  0.461,  Adjusted R-squared:  0.3591 
## F-statistic: 4.521 on 7 and 37 DF,  p-value: 0.001007
Performing Box-Cox Transformation
boxcox(mult_fit)

- From Boxcox we also see the lambda value is close to 0 which means a log-transformation is suggested in this case.

Comparing diagnosed plot before and after performing box-cox
par(mfrow=c(2,2))
plot(mult_fit)

Performing log transformation
mult_fit_log <- lm(log(hate_crimes_per_100k_splc) ~ median_household_income + perc_non_white + perc_non_citizen + perc_population_with_high_school_degree + gini_index + unemployment + urbanization, data=HateCrime_df)
summary(mult_fit_log)
## 
## Call:
## lm(formula = log(hate_crimes_per_100k_splc) ~ median_household_income + 
##     perc_non_white + perc_non_citizen + perc_population_with_high_school_degree + 
##     gini_index + unemployment + urbanization, data = HateCrime_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28845 -0.41144  0.01898  0.31334  1.13022 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             -1.857e+01  5.553e+00  -3.344  0.00190
## median_household_income                 -4.732e-06  1.735e-05  -0.273  0.78658
## perc_non_white                          -1.232e-01  1.069e+00  -0.115  0.90887
## perc_non_citizen                         1.168e+00  5.464e+00   0.214  0.83189
## perc_population_with_high_school_degree  1.121e+01  5.341e+00   2.098  0.04275
## gini_index                               1.670e+01  5.744e+00   2.908  0.00611
## unemploymentlow                          2.179e-01  2.088e-01   1.043  0.30353
## urbanizationlow                         -9.885e-02  2.467e-01  -0.401  0.69092
##                                           
## (Intercept)                             **
## median_household_income                   
## perc_non_white                            
## perc_non_citizen                          
## perc_population_with_high_school_degree * 
## gini_index                              **
## unemploymentlow                           
## urbanizationlow                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5862 on 37 degrees of freedom
## Multiple R-squared:  0.3146, Adjusted R-squared:  0.1849 
## F-statistic: 2.426 on 7 and 37 DF,  p-value: 0.03768
par(mfrow=c(2,2))
plot(mult_fit_log)

  • comment By comparing the diagnostic plot we see that after applying log transformation, we were able to reduce curvature line in the residual versus fitted plot and sqrt(standardized residual) versus fitted value plot, the horizontal band is also closer to zero in the residuals vs fitted value plot, which means log transformation does enable us to have more equally spread error variance. Log transformation also reduce the presence of heavy tails in QQplot in which indicates residuals seem to follow normal distribution in the current model.Finally, log transformation reduce the effect of influence point as indicate in the Residuals vs Leverage plot. Before transformation, one point is outside of the dashed line 1, which means we have an influential point that have cook’s distance larger than 1. After log transformation, all observation are within the dashed line meaning no influential point is shown in the current model.
step(mult_fit_log, direction='backward')
## Start:  AIC=-40.88
## log(hate_crimes_per_100k_splc) ~ median_household_income + perc_non_white + 
##     perc_non_citizen + perc_population_with_high_school_degree + 
##     gini_index + unemployment + urbanization
## 
##                                           Df Sum of Sq    RSS     AIC
## - perc_non_white                           1   0.00456 12.719 -42.859
## - perc_non_citizen                         1   0.01570 12.730 -42.820
## - median_household_income                  1   0.02556 12.740 -42.785
## - urbanization                             1   0.05519 12.770 -42.680
## - unemployment                             1   0.37413 13.089 -41.570
## <none>                                                 12.715 -40.875
## - perc_population_with_high_school_degree  1   1.51318 14.228 -37.815
## - gini_index                               1   2.90660 15.621 -33.611
## 
## Step:  AIC=-42.86
## log(hate_crimes_per_100k_splc) ~ median_household_income + perc_non_citizen + 
##     perc_population_with_high_school_degree + gini_index + unemployment + 
##     urbanization
## 
##                                           Df Sum of Sq    RSS     AIC
## - perc_non_citizen                         1   0.01114 12.730 -44.820
## - median_household_income                  1   0.02946 12.749 -44.755
## - urbanization                             1   0.05718 12.777 -44.657
## - unemployment                             1   0.41699 13.136 -43.408
## <none>                                                 12.719 -42.859
## - perc_population_with_high_school_degree  1   1.73309 14.452 -39.111
## - gini_index                               1   2.90620 15.626 -35.599
## 
## Step:  AIC=-44.82
## log(hate_crimes_per_100k_splc) ~ median_household_income + perc_population_with_high_school_degree + 
##     gini_index + unemployment + urbanization
## 
##                                           Df Sum of Sq    RSS     AIC
## - median_household_income                  1   0.01910 12.750 -46.752
## - urbanization                             1   0.11092 12.841 -46.429
## - unemployment                             1   0.41466 13.145 -45.377
## <none>                                                 12.730 -44.820
## - perc_population_with_high_school_degree  1   1.92883 14.659 -40.471
## - gini_index                               1   3.00737 15.738 -37.277
## 
## Step:  AIC=-46.75
## log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + 
##     gini_index + unemployment + urbanization
## 
##                                           Df Sum of Sq    RSS     AIC
## - urbanization                             1   0.09183 12.841 -48.429
## - unemployment                             1   0.40424 13.154 -47.348
## <none>                                                 12.750 -46.752
## - gini_index                               1   3.02492 15.774 -39.172
## - perc_population_with_high_school_degree  1   3.15688 15.906 -38.797
## 
## Step:  AIC=-48.43
## log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + 
##     gini_index + unemployment
## 
##                                           Df Sum of Sq    RSS     AIC
## - unemployment                             1    0.3655 13.207 -49.166
## <none>                                                 12.841 -48.429
## - perc_population_with_high_school_degree  1    3.3459 16.187 -40.010
## - gini_index                               1    4.0555 16.897 -38.079
## 
## Step:  AIC=-49.17
## log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + 
##     gini_index
## 
##                                           Df Sum of Sq    RSS     AIC
## <none>                                                 13.207 -49.166
## - gini_index                               1    3.7171 16.924 -40.007
## - perc_population_with_high_school_degree  1    4.4569 17.664 -38.081
## 
## Call:
## lm(formula = log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + 
##     gini_index, data = HateCrime_df)
## 
## Coefficients:
##                             (Intercept)  
##                                  -18.95  
## perc_population_with_high_school_degree  
##                                   11.55  
##                              gini_index  
##                                   16.49
stepwised_multi_fit = lm( log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + 
    gini_index, data = HateCrime_df)

summary(stepwised_multi_fit)
## 
## Call:
## lm(formula = log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree + 
##     gini_index, data = HateCrime_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34787 -0.39659 -0.00387  0.44892  1.06723 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              -18.947      4.254  -4.454 6.14e-05
## perc_population_with_high_school_degree   11.554      3.069   3.765 0.000512
## gini_index                                16.486      4.795   3.438 0.001334
##                                            
## (Intercept)                             ***
## perc_population_with_high_school_degree ***
## gini_index                              ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5608 on 42 degrees of freedom
## Multiple R-squared:  0.288,  Adjusted R-squared:  0.2541 
## F-statistic: 8.496 on 2 and 42 DF,  p-value: 0.0007974